#!/usr/bin/env python
# -*- coding:utf-8 -*-
import sys

sys.path.append('..')

sys.path.append('../..')
sys.path.append('../../..')
import traceback

from pymongo.errors import DuplicateKeyError, InvalidStringData
from bdp.i_crawler.i_downloader.ttypes import  DownLoadRsp
from i_util.tools import get_url_info
# Template used to re-render a wenshu record as a minimal HTML document.
# Slots (in order): Title, PubDate, Html body.
# Fix: the original closed three <div> elements with five </div> tags,
# producing malformed HTML; the two stray closers are removed.
html_pattern = """
<html xmlns='http://www.w3.org/1999/xhtml'>
<head></head
><body>
<div class='Title'>
{}</div><div class='PubDate'>
{}</div><div class='Html'>
{}</div></body></html>
"""

import json

def deal_wenshu(html):
    """Extract the embedded jsonHtmlData blob from a raw wenshu page and
    re-render it as a minimal HTML document via html_pattern.

    :param html: raw page content containing ``jsonHtmlData = "..."``
    :return: formatted HTML string (bytes, py2 str)
    :raises IndexError: if the page does not contain the jsonHtmlData marker
    """
    # Slice out the JS assignment's right-hand side between the two markers.
    blob = html.split("jsonHtmlData = ")[1].split(";\r\n    var jsonData")[0]
    # The value is a JSON string that itself contains JSON -- decode twice.
    obj = json.loads(json.loads(blob))
    # Default to '' for 'Html' too: obj.get('Html') with no default returned
    # None for records missing the key and crashed on .encode().
    return html_pattern.format(
        obj.get('Title', '').encode('utf-8'),
        obj.get('PubDate', ''),
        obj.get('Html', '').encode('utf-8'))
import pymongo
# NOTE(review): hard-coded production Mongo host/port -- consider moving to config.
client = pymongo.MongoClient('101.201.100.58', 27017)
db = client.get_database('crawl_merge_webpage')
# One collection per crawled site; this script only reprocesses court.gov.cn.
collection = db['court.gov.cn']
import re
# Select pages downloaded on/after 2016-10-31 whose URL matches "CreateContentJS"
# (presumably the wenshu detail-page JS endpoint -- TODO confirm).
cursor = collection.find({"download_time":{'$gte':'2016-10-31'},'url':re.compile("CreateContentJS")})

if __name__ == '__main__':
    import time
    for item in cursor:
        url = item.get('url') + "format_html"
        content = item.get('content')
        try:
            content = deal_wenshu(content)
            item_new = {}
            url_info = get_url_info(url)
            item_new['url'] = url
            item_new['url_id'] = url_info.get('url_id')
            item_new['site'] = url_info.get('site')
            item_new['site_id'] = url_info.get('site_id')
            item_new['content'] = content
            item_new['download_time'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
            collection.insert(item_new)
        except Exception as e:
            print url, e.message
